This analysis follows Jake VanderPlas's blog post: http://jakevdp.github.io/blog/2015/07/23/learning-seattles-work-habits-from-bicycle-counts/
In [1]:
%matplotlib notebook
In [2]:
# Load the Mobi hourly trip counts (pickled DataFrame indexed by timestamp).
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a trusted source.
data = pd.read_pickle('taken_hourly_df.p')
#data = data['2017-06':'2017-09']
# Collapse all columns into a single total per timestamp (presumably one
# column per station -- TODO confirm). Cast to float up front so that the NaN
# assignments below do not depend on implicit int -> float upcasting.
data = pd.DataFrame(data.sum(axis=1)).astype(float)
# Blank out two days so they are excluded from the analysis
# (presumably bad or missing counts on these dates -- TODO confirm).
data.loc['2017-06-24'] = np.nan
data.loc['2017-06-25'] = np.nan
In [3]:
# Sanity check: show the last few rows of the aggregated frame.
data.tail()
Out[3]:
In [4]:
# Show every hourly row for one day. Row selection by a date string must go
# through .loc: data['2017-06-29'] is treated as a column lookup by current
# pandas and raises KeyError.
data.loc['2017-06-29']
Out[4]:
In [5]:
# Hourly totals for a five-day window at the end of June / start of July.
f, ax = plt.subplots()
late_june = data.loc['2017-06-28':'2017-07-02'].sum(axis=1)
late_june.plot(ax=ax)
ax.set_ylabel("Bike/hour")
ax.set_xlabel("")
#f.savefig('daydata2017-06-30.png')
Out[5]:
In [6]:
# Hourly totals over the whole loaded period, saved to disk.
# NOTE(review): sum(axis=1) skips NaN, so the days blanked to NaN when the
# data was loaded are drawn as 0 here rather than as a gap -- confirm this is
# the intended rendering.
f, ax = plt.subplots()
hourly_total = data.sum(axis=1)
ax = hourly_total.plot.line(ax=ax)
ax.set_ylabel('Bike/hour')
ax.set_xlabel('')
f.savefig('hourly_usage_may-sep.png')
In [7]:
# Daily totals: group the hourly rows by calendar day and sum them.
# pd.TimeGrouper has been removed from pandas; pd.Grouper(freq='D') is the
# direct replacement and groups on the DatetimeIndex in the same way.
f, ax = plt.subplots()
data.groupby(pd.Grouper(freq='D')).sum().sum(axis=1).plot(ax=ax)
Out[7]:
In [8]:
def plotweek(startdate,enddate):
    """Plot the hourly totals between two dates and save the figure.

    Slices the notebook-level ``data`` frame from ``startdate`` to
    ``enddate`` (date strings, e.g. '2017-07-31'), plots the row sums on a
    new figure, and writes it to 'weekdata-<startdate>-<enddate>.png' in the
    working directory.

    Returns the matplotlib Axes holding the plot.
    """
    window = data[startdate:enddate]
    hourly_total = window.sum(axis=1)
    f, ax = plt.subplots()
    ax = hourly_total.plot(ax=ax)
    ax.set_ylabel('Bike/hour')
    ax.set_xlabel('')
    outfile = 'weekdata-{}-{}.png'.format(startdate, enddate)
    f.savefig(outfile)
    return ax
In [9]:
# Plot and save the week of 31 July - 6 August 2017.
plotweek('2017-07-31','2017-08-06')
Out[9]:
In [10]:
# Plot and save the week of 7 - 13 August 2017.
plotweek('2017-08-07','2017-08-13')
Out[10]:
In [11]:
# Reshape to one row per calendar date and one column per hour of day.
# pivot_table aggregates with its default (mean); date/hour cells with no
# value are filled with 0.
day_labels = data.index.date
hour_labels = data.index.hour
pivoted = data.pivot_table(index=day_labels, columns=hour_labels, fill_value=0)
In [12]:
# Inspect the first rows of the date x hour table.
pivoted.head()
Out[12]:
Extract the raw values into a 2-D NumPy array (one row per day, one column per hour).
In [14]:
# Feature matrix for PCA: the pivot table as a plain ndarray.
X = pivoted.to_numpy()
# Grand total of all cells, printed as a quick sanity check.
print(X.sum())
X.shape
Out[14]:
In [15]:
%matplotlib notebook
In [16]:
from sklearn.decomposition import PCA

# A fractional n_components keeps as many principal components as are needed
# to explain that share (here 90%) of the variance.
pca = PCA(n_components=0.9)
Xpca = pca.fit_transform(X)
Xpca.shape
Out[16]:
In [17]:
# "MM DD" text label for each row of the pivot table, used to annotate
# points in the scatter plots further down.
dates = np.array([day.strftime("%m %d") for day in pivoted.index.values])
dates.shape
Out[17]:
In [18]:
# First two principal components, coloured by each day's row sum of X.
total_trips = X.sum(axis=1)
pc1, pc2 = Xpca[:, 0], Xpca[:, 1]
f, ax = plt.subplots()
scat = ax.scatter(pc1, pc2, c=total_trips, cmap='cool', picker=0.5)
# The PCA coordinates have no physical units, so hide the tick marks.
ax.set_xticks([])
ax.set_yticks([])
f.colorbar(scat, label='total trips');
f.savefig('PCA_numtrips.png')
In [19]:
# First two principal components, coloured by day of week (0 = Monday).
f2, ax2 = plt.subplots()
dayofweek = pd.to_datetime(pivoted.index).dayofweek
# plt.cm.get_cmap is deprecated/removed in recent matplotlib; plt.get_cmap
# takes the same (name, lut) arguments and returns the 7-colour discrete map.
scat2 = ax2.scatter(Xpca[:, 0], Xpca[:, 1], c=dayofweek,
                    cmap=plt.get_cmap('jet', 7))
# Centre each integer day on its colour band so the colourbar ticks line up
# with the bands (Figure has no clim(); the limits belong to the scatter).
scat2.set_clim(-0.5, 6.5)
cb = f2.colorbar(scat2, ticks=range(7))
ax2.get_xaxis().set_ticks([])
ax2.get_yaxis().set_ticks([])
cb.set_ticklabels(['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'])
f2.savefig('PCA_dayofweek.png')
In [26]:
# Day-of-week scatter with selected weekend days labelled by date.
f2, ax2 = plt.subplots()
dayofweek = pd.to_datetime(pivoted.index).dayofweek
# plt.cm.get_cmap is deprecated/removed in recent matplotlib; plt.get_cmap
# takes the same (name, lut) arguments and returns the 7-colour discrete map.
scat2 = ax2.scatter(Xpca[:, 0], Xpca[:, 1], c=dayofweek,
                    cmap=plt.get_cmap('jet', 7))
# Centre each integer day on its colour band so the colourbar ticks line up
# with the bands (Figure has no clim(); the limits belong to the scatter).
scat2.set_clim(-0.5, 6.5)
cb = f2.colorbar(scat2, ticks=range(7))
ax2.get_xaxis().set_ticks([])
ax2.get_yaxis().set_ticks([])
cb.set_ticklabels(['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'])
# Annotate Saturdays/Sundays (dow > 4) whose first component exceeds 15.
# NOTE(review): the threshold 15 looks hand-picked from the plot -- confirm.
for date, x, y, dow in zip(dates, Xpca[:, 0], Xpca[:, 1], dayofweek):
    if dow > 4 and x > 15:
        ax2.annotate(date, (x, y), xytext=(-20, -60), textcoords='offset points',
                     arrowprops=dict(facecolor='black', headwidth=10, alpha=0.4, width=1))
f2.savefig('PCA_dayofweek_lowvolweekends.png')
In [27]:
# Day-of-week scatter with selected Mondays labelled by date.
f2, ax2 = plt.subplots()
dayofweek = pd.to_datetime(pivoted.index).dayofweek
# plt.cm.get_cmap is deprecated/removed in recent matplotlib; plt.get_cmap
# takes the same (name, lut) arguments and returns the 7-colour discrete map.
scat2 = ax2.scatter(Xpca[:, 0], Xpca[:, 1], c=dayofweek,
                    cmap=plt.get_cmap('jet', 7))
# Centre each integer day on its colour band so the colourbar ticks line up
# with the bands (Figure has no clim(); the limits belong to the scatter).
scat2.set_clim(-0.5, 6.5)
cb = f2.colorbar(scat2, ticks=range(7))
ax2.get_xaxis().set_ticks([])
ax2.get_yaxis().set_ticks([])
cb.set_ticklabels(['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun'])
# Annotate Mondays (dow == 0) whose second component exceeds 20.
# NOTE(review): the threshold 20 looks hand-picked from the plot -- confirm.
for date, x, y, dow in zip(dates, Xpca[:, 0], Xpca[:, 1], dayofweek):
    if dow == 0 and y > 20:
        ax2.annotate(date, (x, y), xytext=(20, -60), textcoords='offset points',
                     arrowprops=dict(facecolor='black', headwidth=10, alpha=0.4, width=1))
f2.savefig('PCA_dayofweek_mondays.png')
In [28]:
# Calendar month number (1-12) for each row of the pivot table.
month = np.array([day.month for day in pivoted.index.values])
month;
In [29]:
# First two principal components, coloured by calendar month.
f2, ax2 = plt.subplots()
dayofweek = pd.to_datetime(pivoted.index).dayofweek
# plt.cm.get_cmap is deprecated/removed in recent matplotlib; plt.get_cmap
# takes the same (name, lut) arguments and returns the 4-colour discrete map.
scat2 = ax2.scatter(Xpca[:, 0], Xpca[:, 1], c=month,
                    cmap=plt.get_cmap('jet', 4))
# NOTE(review): ticks and colours assume the data covers June-September
# only (months 6-9) -- confirm against the loaded date range.
cb = f2.colorbar(scat2, ticks=range(6, 10))
ax2.get_xaxis().set_ticks([])
ax2.get_yaxis().set_ticks([])
# Exactly one label per tick: a label count that differs from the tick count
# raises ValueError in current matplotlib.
cb.set_ticklabels(['Jun', 'Jul', 'Aug', 'Sep'])
In [30]:
# Sanity check: there should be one day-of-week value per pivot-table row.
dayofweek.shape
Out[30]:
In [31]:
# Last date present in the pivot table.
pivoted.index[-1]
Out[31]:
In [32]:
# Day of week of that last date (0 = Monday ... 6 = Sunday).
pd.to_datetime(pivoted.index[-1]).dayofweek
Out[32]:
In [34]:
# The blog post this notebook follows predates sklearn's GaussianMixture
# class; GaussianMixture is used here for the two-cluster fit.
from sklearn.mixture import GaussianMixture

# Fit a two-component Gaussian mixture to the PCA coordinates and colour
# each day by its assigned cluster. The fit starts from a random
# initialisation, so fix the seed: otherwise the cluster labels (and hence
# the colours in the saved figure) can change from run to run.
gmm = GaussianMixture(n_components=2, random_state=0)
gmm.fit(Xpca)
cluster_label = gmm.predict(Xpca)

f3, ax3 = plt.subplots()
ax3.get_xaxis().set_ticks([])
ax3.get_yaxis().set_ticks([])
ax3.scatter(Xpca[:, 0], Xpca[:, 1], c=cluster_label)
f3.savefig('PCA_clustering.png')
In [ ]: